import os
import re

import mindspore.dataset.text as text
from mindspore.dataset.text import JiebaMode

# Chinese text is segmented with Jieba; these are the paths to the HMM model and MP dictionary files it needs.
jieba_dict_path = r"E:\mindspore\tests\ut\data\dataset\jiebadict"
jieba_hmm_file = os.path.join(jieba_dict_path, "hmm_model.utf8")
jieba_mp_file = os.path.join(jieba_dict_path, "jieba.dict.utf8")

# Chinese tokenizer: Jieba in HMM mode, without returning token offsets
cn_tokenizer_op = text.JiebaTokenizer(jieba_hmm_file, jieba_mp_file, mode=JiebaMode.HMM, with_offsets=False)
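# JiebaTokenizer also accepts user-defined vocabulary through add_word()/add_dict().
# A minimal sketch: the extra word below is illustrative only and is not used by the
# tests; a separate MP-mode instance is created so cn_tokenizer_op above stays untouched.
cn_custom_tokenizer_op = text.JiebaTokenizer(jieba_hmm_file, jieba_mp_file, mode=JiebaMode.MP)
cn_custom_tokenizer_op.add_word("昇思")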


# English tokenizer: lower-cases the input and splits it into words and punctuation marks with a regular expression
def en_tokenize_op(sentence: str):
    sentence = sentence.rstrip()
    return [tok.lower() for tok in re.findall(r'\w+|[^\w\s]', sentence)]


# Example tests for both tokenizers
def test_tokenizer():
    # Chinese tokenization test (the tokenizer op is invoked eagerly on a single string)
    sentence = "今天天气不错，希望明天也不会下雨。"
    tokenized_sentence = cn_tokenizer_op(sentence)
    print(tokenized_sentence)

    # English tokenization test
    sentence = "The quick brown fox jumps over the lazy dog."
    tokenized_sentence = en_tokenize_op(sentence)
    print(tokenized_sentence)


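# A minimal sketch (not part of the original tests) of applying the Jieba tokenizer
# inside a MindSpore dataset pipeline via map() instead of calling it eagerly. The
# function name and sample sentence are illustrative; it assumes the dictionary files
# above exist on disk. It is defined here for reference and is not invoked by default.
def demo_pipeline():
    import mindspore.dataset as ds

    sentences = ["今天天气不错，希望明天也不会下雨。"]
    dataset = ds.NumpySlicesDataset(sentences, column_names=["text"], shuffle=False)
    # Run the Jieba tokenizer over the "text" column of every sample.
    dataset = dataset.map(operations=cn_tokenizer_op, input_columns=["text"])
    for row in dataset.create_dict_iterator(output_numpy=True):
        print(row["text"])

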
if __name__ == "__main__":
    test_tokenizer()
